# pip install openpyxl


import warnings
warnings.filterwarnings('ignore')


import pandas as pd
import numpy as np
import seaborn as sns
import seaborn as sb
%matplotlib inline
import matplotlib.pyplot as plt


data1 = pd.read_excel('dataset.xlsx', engine='openpyxl')


data1.head()


data1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5644 entries, 0 to 5643
Columns: 111 entries, Patient ID to ctO2 (arterial blood gas analysis)
dtypes: float64(70), int64(4), object(37)
memory usage: 4.8+ MB


data1.fillna(0)


data1.shape

(5644, 111)


data1 = data1.drop('Patient ID', axis=1)


# Integer encoding applied to the textual test results:
# 'negative' / 'not_detected' -> 0, 'positive' / 'detected' -> 1.
code = {
    'negative': 0,
    'positive': 1,
    'not_detected': 0,
    'detected': 1,
}


# Re-encode every object-dtype column of data1 through `code`.
# NOTE(review): Series.map yields NaN for any value that is not a key of
# `code`, so other categories in these columns are turned into missing values
# here -- confirm that this is intended.
object_columns = data1.select_dtypes(include='object').columns
for col in object_columns:
    data1[col] = data1[col].map(code)


data1.dtypes.value_counts()

float64    105
int64        5
dtype: int64


def clean_dataset(data1):
    """
    Return a float64 copy of ``data1`` without rows holding NaN or +/-inf.

    Parameters
    ----------
    data1 : pd.DataFrame
        Frame to clean. It is left unmodified; the cleaned rows are returned
        as a new object.

    Returns
    -------
    pd.DataFrame
        The rows of ``data1`` that contain neither NaN nor +/-infinity, with
        every column cast to ``np.float64``.

    Raises
    ------
    TypeError
        If ``data1`` is not a ``pd.DataFrame``.
    """
    # Explicit raise instead of assert: asserts are stripped under ``python -O``.
    if not isinstance(data1, pd.DataFrame):
        raise TypeError("data1 needs to be a pd.DataFrame")

    # Non-inplace dropna so the caller's frame is not mutated as a side effect.
    without_nan = data1.dropna()

    # NaN rows are already gone; only the infinities remain to be filtered.
    # ``axis`` is passed by keyword because recent pandas rejects it positionally.
    has_infinite = without_nan.isin([np.inf, -np.inf]).any(axis=1)
    return without_nan[~has_infinite].astype(np.float64)


data1.replace([np.inf, -np.inf], np.nan, inplace=True)


data1.fillna(999, inplace=True)


# Create X (features matrix)
X = data1.drop("SARS-Cov-2 exam result", axis=1)

# Create y (labels)
y = data1["SARS-Cov-2 exam result"]


# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# We'll keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}


# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


clf.fit(X_train, y_train);


X_train


y_preds = clf.predict(X_test)
y_preds

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)


y_test

2451    0
966     0
8       0
1350    0
4522    1
       ..
1693    0
1897    0
3652    0
3502    0
2347    0
Name: SARS-Cov-2 exam result, Length: 1129, dtype: int64


# 4. Evaluate the model on the training data and test data
clf.score(X_train, y_train)

0.9207087486157254


clf.score(X_test, y_test)

0.8937112488928255


from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.90      0.99      0.94      1009
           1       0.50      0.05      0.09       120

    accuracy                           0.89      1129
   macro avg       0.70      0.52      0.52      1129
weighted avg       0.86      0.89      0.85      1129


confusion_matrix(y_test, y_preds)

array([[1003,    6],
       [ 114,    6]], dtype=int64)


accuracy_score(y_test, y_preds)

0.8937112488928255


# 5. Improve a model
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set: 89.28%

Trying model with 20 estimators...
Model accuracy on test set: 89.55%

Trying model with 30 estimators...
Model accuracy on test set: 89.55%

Trying model with 40 estimators...
Model accuracy on test set: 89.28%

Trying model with 50 estimators...
Model accuracy on test set: 89.28%

Trying model with 60 estimators...
Model accuracy on test set: 89.55%

Trying model with 70 estimators...
Model accuracy on test set: 89.55%

Trying model with 80 estimators...
Model accuracy on test set: 89.28%

Trying model with 90 estimators...
Model accuracy on test set: 89.55%


# 6. Save a model and load it
import pickle

# Write the fitted classifier to disk. The context manager closes the file
# handle (and flushes the pickle) even if dump() raises.
with open("random_forst_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)


# Read the classifier back and score it on the held-out split.
# NOTE: pickle.load can execute arbitrary code; only load files you created
# yourself or otherwise trust.
with open("random_forst_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(X_test, y_test)

0.895482728077945


data1.head()


X = data1.drop("SARS-Cov-2 exam result", axis=1)
X.head()


y = data1["SARS-Cov-2 exam result"]
y.head()

0    0
1    0
2    0
3    0
4    0
Name: SARS-Cov-2 exam result, dtype: int64


# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size=0.3)


X_train.shape, X_test.shape, y_train.shape, y_test.shape

((3950, 109), (1694, 109), (3950,), (1694,))


X.shape[0] * 0.8

4515.2


len(data1)

5644


# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Setup random seed
np.random.seed(42)

# Make the data
X = data1.drop("SARS-Cov-2 exam result", axis=1)
y = data1["SARS-Cov-2 exam result"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate LinearSVC
clf = LinearSVC(max_iter=10000)
clf.fit(X_train, y_train)

# Evaluate the LinearSVC
clf.score(X_test, y_test)

0.9034543844109831


data1["SARS-Cov-2 exam result"].value_counts()

0    5086
1     558
Name: SARS-Cov-2 exam result, dtype: int64


# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data
X = data1.drop("SARS-Cov-2 exam result", axis=1)
y = data1["SARS-Cov-2 exam result"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier
clf.score(X_test, y_test)

0.9078830823737821


# Import the RandomForestClassifier estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make the data
X = data1.drop("SARS-Cov-2 exam result", axis=1)
y = data1["SARS-Cov-2 exam result"]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100)

# Fit the model to the data (training the machine learning model)
clf.fit(X_train, y_train)

# Evaluate the Random Forest Classifier (use the patterns the model has learned)
clf.score(X_test, y_test)

0.9078830823737821


X.head()


y.tail()

5639    1
5640    0
5641    0
5642    0
5643    1
Name: SARS-Cov-2 exam result, dtype: int64


X_test.head()


clf.predict(X_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)


np.array(y_test)

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)


# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(X_test)
np.mean(y_preds == y_test)

0.9078830823737821


clf.score(X_test, y_test)

0.9078830823737821


from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

0.9078830823737821


# predict_proba() returns probabilities of a classification label 
clf.predict_proba(X_test[:5])

array([[0.95051218, 0.04948782],
       [0.99351432, 0.00648568],
       [0.80652724, 0.19347276],
       [0.86728542, 0.13271458],
       [0.86728542, 0.13271458]])


# Let's predict() on the same data...
clf.predict(X_test[:5])

array([0, 0, 0, 0, 0], dtype=int64)


X_test[:5]


data1["SARS-Cov-2 exam result"].value_counts()

0    5086
1     558
Name: SARS-Cov-2 exam result, dtype: int64


# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

0.09211691762621789


from sklearn.metrics import roc_curve
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier


np.random.seed(42)

X = data1.drop("SARS-Cov-2 exam result", axis=1)
y = data1["SARS-Cov-2 exam result"]

clf = RandomForestClassifier(n_estimators=100)
# Keep the per-fold accuracies under their own name. Assigning them to
# `cross_val_score` would overwrite the imported function, so this cell (or any
# later call to cross_val_score) would fail when run a second time.
cv_scores = cross_val_score(clf, X, y, cv=5)


np.mean(cv_scores)

0.8998935227936604


print(f"Cross accuracy of Covid19 classifier: {np.mean(cv_scores) *100:.2f}%")

Cross accuracy of Covid19 classifier: 89.99%


# Create X_test... etc
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


from sklearn.metrics import roc_curve

# Fit the classifier
clf.fit(X_train, y_train)

# Make predictions with probabilities
y_probs = clf.predict_proba(X_test)

y_probs[:10], len(y_probs)

(array([[0.86339148, 0.13660852],
        [0.85533253, 0.14466747],
        [0.94572144, 0.05427856],
        [0.86339148, 0.13660852],
        [0.84943862, 0.15056138],
        [1.        , 0.        ],
        [1.        , 0.        ],
        [0.85935234, 0.14064766],
        [0.94578416, 0.05421584],
        [0.88877406, 0.11122594]]),
 1129)


y_probs_positive = y_probs[:, 1]
y_probs_positive[:10]

array([0.13660852, 0.14466747, 0.05427856, 0.13660852, 0.15056138,
       0.        , 0.        , 0.14064766, 0.05421584, 0.11122594])


# Calculate fpr, tpr and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

# Check the false positive rates
fpr

array([0.00000000e+00, 9.72762646e-04, 1.94552529e-03, 1.94552529e-03,
       4.86381323e-03, 4.86381323e-03, 6.80933852e-03, 6.80933852e-03,
       8.75486381e-03, 1.16731518e-02, 1.26459144e-02, 1.26459144e-02,
       1.36186770e-02, 1.45914397e-02, 1.65369650e-02, 1.75097276e-02,
       1.75097276e-02, 2.04280156e-02, 2.04280156e-02, 2.23735409e-02,
       2.23735409e-02, 2.33463035e-02, 2.52918288e-02, 2.91828794e-02,
       3.40466926e-02, 3.50194553e-02, 3.89105058e-02, 4.08560311e-02,
       4.28015564e-02, 4.57198444e-02, 4.66926070e-02, 4.76653696e-02,
       5.44747082e-02, 6.61478599e-02, 6.71206226e-02, 6.80933852e-02,
       1.11867704e-01, 1.13813230e-01, 1.17704280e-01, 1.21595331e-01,
       1.26459144e-01, 1.28404669e-01, 1.54669261e-01, 1.56614786e-01,
       1.57587549e-01, 1.86770428e-01, 2.32490272e-01, 2.65564202e-01,
       2.68482490e-01, 2.69455253e-01, 3.08365759e-01, 3.08365759e-01,
       3.11284047e-01, 3.12256809e-01, 3.18093385e-01, 3.44357977e-01,
       3.64785992e-01, 3.92023346e-01, 4.14396887e-01, 4.15369650e-01,
       4.20233463e-01, 4.69844358e-01, 4.76653696e-01, 4.79571984e-01,
       4.79571984e-01, 4.83463035e-01, 5.00972763e-01, 5.43774319e-01,
       5.71984436e-01, 5.72957198e-01, 5.77821012e-01, 6.10894942e-01,
       6.14785992e-01, 6.14785992e-01, 6.15758755e-01, 6.18677043e-01,
       6.18677043e-01, 6.22568093e-01, 6.23540856e-01, 6.57587549e-01,
       6.63424125e-01, 6.95525292e-01, 6.97470817e-01, 7.01361868e-01,
       7.08171206e-01, 7.10116732e-01, 7.13035019e-01, 7.15953307e-01,
       7.20817121e-01, 7.24708171e-01, 7.25680934e-01, 7.31517510e-01,
       7.33463035e-01, 7.34435798e-01, 7.41245136e-01, 7.45136187e-01,
       7.53891051e-01, 7.57782101e-01, 7.65564202e-01, 7.65564202e-01,
       7.67509728e-01, 7.69455253e-01, 7.69455253e-01, 7.70428016e-01,
       7.92801556e-01, 7.98638132e-01, 8.03501946e-01, 8.05447471e-01,
       8.08365759e-01, 8.12256809e-01, 1.00000000e+00])


# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Draw a ROC curve on the current Matplotlib axes and show the figure.

    fpr -- false positive rates, plotted on the x-axis
    tpr -- true positive rates, plotted on the y-axis
    """
    axes = plt.gca()

    # The model's ROC curve. A diagonal "guessing" baseline is not drawn here.
    axes.plot(fpr, tpr, color="orange", label="ROC")

    # Axis labels, title and legend
    axes.set_xlabel("False positive rate (fpr)")
    axes.set_ylabel("True positive rate (tpr)")
    axes.set_title("Receiver Operating Characteristic (ROC) Curve")
    axes.legend()

    plt.show()

plot_roc_curve(fpr, tpr)


from sklearn.metrics import roc_auc_score

roc_auc_score(y_test, y_probs_positive)

0.6459770004237779


# Plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(y_test, y_test)
plot_roc_curve(fpr, tpr)


# Perfect AUC score
roc_auc_score(y_test, y_test)

1.0


from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)

confusion_matrix(y_test, y_preds)

array([[1022,    6],
       [  98,    3]], dtype=int64)


# Visualize confusion matrix with pd.crosstab()
pd.crosstab(y_test,
            y_preds,
            rownames=["Actual Labels"],
            colnames=["Predicted Labels"])


len(X_test)

1129


# Make our confusion matrix more visual with Seaborn's heatmap()
import seaborn as sns

# Set the font scale 
sns.set(font_scale=1.5)

# Create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

# Plot it using Seaborn
sns.heatmap(conf_mat);


def plot_conf_mat(conf_mat):
    """
    Plot a confusion matrix as an annotated Seaborn heatmap.

    Parameters
    ----------
    conf_mat : 2-D array-like
        Confusion matrix to draw, passed straight to ``sns.heatmap``; rows
        are labelled as true labels and columns as predicted labels.

    Returns
    -------
    None
        The heatmap is drawn on a new 3x3 inch figure.
    """
    _, ax = plt.subplots(figsize=(3, 3))
    sns.heatmap(conf_mat,
                annot=True,  # Annotate the boxes with conf_mat info
                fmt="g",     # default ".2g" would print 1022 as "1e+03"
                cbar=False,
                ax=ax)
    ax.set_xlabel("Predicted label")
    ax.set_ylabel("True label")

    # Matplotlib 3.1.1 cut the first and last heatmap rows in half. Pinning the
    # y-limits to the full row extent repairs that version and changes nothing
    # on versions without the bug (unlike shifting the current limits by 0.5).
    ax.set_ylim(len(conf_mat), 0)
    
plot_conf_mat(conf_mat)


from sklearn.metrics import ConfusionMatrixDisplay

# plot_confusion_matrix() was deprecated in scikit-learn 1.0 and removed in 1.2.
# Use its replacement when the installed version provides it, and fall back to
# the old function on older installations.
# NOTE(review): this evaluates on the full X, y, which includes the rows the
# classifier was fitted on -- confirm whether X_test, y_test was intended.
if hasattr(ConfusionMatrixDisplay, "from_estimator"):
    ConfusionMatrixDisplay.from_estimator(clf, X, y)
else:
    from sklearn.metrics import plot_confusion_matrix

    plot_confusion_matrix(clf, X, y)

<sklearn.metrics._plot.confusion_matrix.ConfusionMatrixDisplay at 0x1a1c4d96d60>


from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.91      0.99      0.95      1028
           1       0.33      0.03      0.05       101

    accuracy                           0.91      1129
   macro avg       0.62      0.51      0.50      1129
weighted avg       0.86      0.91      0.87      1129


Comme vous pouvez le voir, il s'agit d'une classification binaire avec LinearSVC. La classe 1 a une précision moins élevée que la classe 0 (-58 %), mais la classe 0 a un rappel plus élevé que la classe 1 (+11 %). Comment interpréteriez-vous cela ? Et deux autres questions : que signifie « support » ? Pourquoi les scores de précision et de rappel du rapport de classification sont-ils différents de ceux que j'obtiens avec sklearn.metrics.precision_score ou sklearn.metrics.recall_score ?


# Where precision and recall become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # only one positive case

disease_preds = np.zeros(10000) # model predicts every case as 0

pd.DataFrame(classification_report(disease_true,
                                   disease_preds,
                                   output_dict=True))

	Patient ID	Patient age quantile	SARS-Cov-2 exam result	Hematocrit	Hemoglobin	Platelets	Mean platelet volume	...	Hb saturation (arterial blood gases)	pCO2 (arterial blood gas analysis)	Base excess (arterial blood gas analysis)	pH (arterial blood gas analysis)	Total CO2 (arterial blood gas analysis)	HCO3 (arterial blood gas analysis)	pO2 (arterial blood gas analysis)	Arteiral Fio2	Phosphor	ctO2 (arterial blood gas analysis)
0	44477f75e8169d2	13	negative	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	126e9dd13932f68	17	negative	0.236515	-0.02234	-0.517413	0.010677	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
2	a46b4402a0e5696	8	negative	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
3	f7d619a94f97c45	5	negative	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
4	d9e41465789c2b5	15	negative	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	Patient age quantile	Patient addmited to regular ward (1=yes, 0=no)	Patient addmited to semi-intensive unit (1=yes, 0=no)	Patient addmited to intensive care unit (1=yes, 0=no)	Hematocrit	Hemoglobin	Platelets	Mean platelet volume	Red blood Cells	Lymphocytes	...	Hb saturation (arterial blood gases)	pCO2 (arterial blood gas analysis)	Base excess (arterial blood gas analysis)	pH (arterial blood gas analysis)	Total CO2 (arterial blood gas analysis)	HCO3 (arterial blood gas analysis)	pO2 (arterial blood gas analysis)	Arteiral Fio2	Phosphor	ctO2 (arterial blood gas analysis)
1832	2	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
3024	6	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
1335	7	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
261	11	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
4545	9	0	0	0	-0.083925	-0.33562	0.33679	-0.774677	0.225417	-1.029223	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1901	8	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
597	0	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
4213	18	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
1139	15	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
5284	5	0	0	0	999.000000	999.00000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0

	Patient age quantile	Hematocrit	Hemoglobin	Platelets	Mean platelet volume	Red blood Cells	...	Hb saturation (arterial blood gases)	pCO2 (arterial blood gas analysis)	Base excess (arterial blood gas analysis)	pH (arterial blood gas analysis)	Total CO2 (arterial blood gas analysis)	HCO3 (arterial blood gas analysis)	pO2 (arterial blood gas analysis)	Arteiral Fio2	Phosphor	ctO2 (arterial blood gas analysis)
0	13	999.000000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
1	17	0.236515	-0.02234	-0.517413	0.010677	0.102004	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
2	8	999.000000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
3	5	999.000000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
4	15	999.000000	999.00000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0

	Patient age quantile	Hematocrit	Hemoglobin	Platelets	Mean platelet volume	Red blood Cells	Lymphocytes	...	Hb saturation (arterial blood gases)	pCO2 (arterial blood gas analysis)	Base excess (arterial blood gas analysis)	pH (arterial blood gas analysis)	Total CO2 (arterial blood gas analysis)	HCO3 (arterial blood gas analysis)	pO2 (arterial blood gas analysis)	Arteiral Fio2	Phosphor	ctO2 (arterial blood gas analysis)
0	13	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
1	17	0.236515	-0.02234	-0.517413	0.010677	0.102004	0.318366	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
2	8	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
3	5	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
4	15	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0

	Patient age quantile	Hematocrit	Hemoglobin	Platelets	Mean platelet volume	Red blood Cells	Lymphocytes	...	Hb saturation (arterial blood gases)	pCO2 (arterial blood gas analysis)	Base excess (arterial blood gas analysis)	pH (arterial blood gas analysis)	Total CO2 (arterial blood gas analysis)	HCO3 (arterial blood gas analysis)	pO2 (arterial blood gas analysis)	Arteiral Fio2	Phosphor	ctO2 (arterial blood gas analysis)
0	13	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
1	17	0.236515	-0.02234	-0.517413	0.010677	0.102004	0.318366	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
2	8	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
3	5	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
4	15	999.000000	999.00000	999.000000	999.000000	999.000000	999.000000	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0

Diagnosis of COVID-19 and its clinical spectrum

Data Sciences and Working Approach

Basic checklist (not exhaustive)

Pre-processing

For an entire DataFrame using Pandas:

1. Getting our data ready to be used with machine learning

Choosing an estimator for a classification problem

Fitting the model to the data

Make predictions using a machine learning model

Confusion Matrix

Classification Report

	Patient ID	Patient age quantile	SARS-Cov-2 exam result	Patient addmited to regular ward (1=yes, 0=no)	Patient addmited to semi-intensive unit (1=yes, 0=no)	Patient addmited to intensive care unit (1=yes, 0=no)	Hematocrit	Hemoglobin	Platelets	Mean platelet volume	...	Hb saturation (arterial blood gases)	pCO2 (arterial blood gas analysis)	Base excess (arterial blood gas analysis)	pH (arterial blood gas analysis)	Total CO2 (arterial blood gas analysis)	HCO3 (arterial blood gas analysis)	pO2 (arterial blood gas analysis)	Arteiral Fio2	Phosphor	ctO2 (arterial blood gas analysis)
0	44477f75e8169d2	13	negative	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
1	126e9dd13932f68	17	negative	0	0	0	0.236515	-0.022340	-0.517413	0.010677	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
2	a46b4402a0e5696	8	negative	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
3	f7d619a94f97c45	5	negative	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
4	d9e41465789c2b5	15	negative	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
5639	ae66feb9e4dc3a0	3	positive	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5640	517c2834024f3ea	17	negative	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5641	5c57d6037fe266d	4	negative	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5642	c20c44766f28291	10	negative	0	0	0	0.000000	0.000000	0.000000	0.000000	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0
5643	2697fdccbfeb7f7	19	positive	0	0	0	0.694287	0.541564	-0.906829	-0.325903	...	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0

	Patient age quantile	Hematocrit	Hemoglobin	Platelets	Mean platelet volume	Red blood Cells	Lymphocytes	...	Hb saturation (arterial blood gases)	pCO2 (arterial blood gas analysis)	Base excess (arterial blood gas analysis)	pH (arterial blood gas analysis)	Total CO2 (arterial blood gas analysis)	HCO3 (arterial blood gas analysis)	pO2 (arterial blood gas analysis)	Arteiral Fio2	Phosphor	ctO2 (arterial blood gas analysis)
1694	17	999.0	999.0	999.0	999.0	999.0	999.0	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
4434	14	999.0	999.0	999.0	999.0	999.0	999.0	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
3297	8	999.0	999.0	999.0	999.0	999.0	999.0	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
3980	5	999.0	999.0	999.0	999.0	999.0	999.0	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0
4165	5	999.0	999.0	999.0	999.0	999.0	999.0	...	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0	999.0

	0.0	1.0	accuracy	macro avg	weighted avg
precision	0.99990	0.0	0.9999	0.499950	0.99980
recall	1.00000	0.0	0.9999	0.500000	0.99990
f1-score	0.99995	0.0	0.9999	0.499975	0.99985
support	9999.00000	1.0	0.9999	10000.000000	10000.00000

Predicted Labels	0	1
Actual Labels
0	1022	6
1	98	3